Ferritin ID Thresholds

FIGUREPATH <- "~/who_ferritin_comment/results/figures/"
BOOTPATH <- "~/who_ferritin_comment/results/boot/"
DATAPATH <- "~/who_ferritin_comment/data/"
MODELPATH <- "~/who_ferritin_comment/results/models/"
knots <- 3:25
response <- "Hemoglobin"
zoom <- T
boot_n <- 1000
Libraries
source("~/who_ferritin_comment/src/functions.R")
library(dplyr)
library(rms)
library(mcp)
library(boot)
library(scam)

Load data

# Load data
# Naming assumptions:
# Ferritin, Hemoglobin, sTfR
data <- readRDS(paste0(DATAPATH, "NHANES_variables_filtered.rds"))

Test RCS model selection

(If plots are too small due to layout settings, you may change the chunk option layout-ncol: 4 to 3, 2, or 1. You may also view these plots by accessing your FIGUREPATH, or by right-clicking and selecting Open image in new tab.)

Visualize RCS knots
data_full <- data
data <- data %>% select(Ferritin, as.name(response))
# set datadist
dd <- datadist(data)
options(datadist = "dd")

investigate_RCS(data, response, knots, zoom, FIGUREPATH)

Bootstrap knot selection

Select optimum number of knots via bootstrapping and AIC
# The function boot_knot_selection is defined in functions.R
if (!file.exists(paste0(BOOTPATH, response, "_rcs_knots_", boot_n, "_bootobj.rds"))) {
    boot_knots <- boot(data = data, statistic = boot_knot_selection, R = boot_n, response = response, knots = knots)
    boot_knot_ci <- boot.ci(boot_knots, type = "perc") # BCa not computable on discrete data with this tight interval
    saveRDS(boot_knots, paste0(BOOTPATH, response, "_rcs_knots_", boot_n, "_bootobj.rds"))
    saveRDS(boot_knot_ci, paste0(BOOTPATH, response, "_rcs_knot_ci_", boot_n, "_boot_ci.rds"))
} else {
    boot_knots <- readRDS(paste0(BOOTPATH, response, "_rcs_knots_", boot_n, "_bootobj.rds"))
    boot_knot_ci <- readRDS(paste0(BOOTPATH, response, "_rcs_knot_ci_", boot_n, "_boot_ci.rds"))
    }

boot_df <- data.frame(t = boot_knots$t) %>% 
    mutate(within_ci = (t >= boot_knot_ci$percent[4] & t <= boot_knot_ci$percent[5]))

bootknot_ci_p <- ggplot(data = boot_df, aes(x = t, y = ..count../sum(..count..), fill = within_ci)) + 
    geom_bar() +
    theme_minimal() +
    labs(x = "Selected number of knots",
         y = "Frequency",
         fill = "Within CI")

ggsave(paste0(FIGUREPATH, response, "_rcs_boot_knot_distr.pdf"), bootknot_ci_p, width = 8, height = 6, device = cairo_pdf)

bootknot_ci_p

Select optimum number of knots via bootstrapping and AIC
selected_knot_n <- boot_knots$t0

# Final RCS fit
rcs_fit <- ols( as.formula(sprintf("%s ~ rcs(Ferritin, %s)", response, selected_knot_n)), data = data, x = T, y = T)
saveRDS(rcs_fit, paste0(MODELPATH, "rcs_final.rds"))

Bootstrap threshold

Now use selected knot number to bootstrap saddle estimates

Find estimates for saddle point via bootstrapping
# The function boot_saddle  is defined in functions.R
if (!file.exists(paste0(BOOTPATH, response, "_rcs_saddle_", boot_n, "_bootobj.rds"))) {
    boot_saddle <- boot(data = data, statistic = boot_saddle_estimate, R = boot_n, response = response, knot_n = selected_knot_n)
    invalid_indices <- which(boot_saddle$t == 0)
    boot_saddle_ci <- boot.ci(boot_saddle, type = "perc", exclude = invalid_indices)
    saveRDS(boot_saddle, paste0(BOOTPATH, response, "_rcs_saddle_", boot_n, "_bootobj.rds"))
    saveRDS(boot_saddle_ci, paste0(BOOTPATH, response, "_rcs_saddle_ci_", boot_n, "_boot_ci.rds"))
} else {
    boot_saddle <- readRDS(paste0(BOOTPATH, response, "_rcs_saddle_", boot_n, "_bootobj.rds"))
    boot_saddle_ci <- readRDS(paste0(BOOTPATH, response, "_rcs_saddle_ci_", boot_n, "_boot_ci.rds"))
    invalid_indices <- which(boot_saddle$t == 0)
    }

paste0("Bootstrapped saddle point estimate is computable in ", round((1 - length(invalid_indices)/length(boot_saddle$t)) * 100, 2), "% of iterations.")
[1] "Bootstrapped saddle point estimate is computable in 99.9% of iterations."
Find estimates for saddle point via bootstrapping
rcs_final_p <- plot_rcs(data, rcs_fit, n_knot = selected_knot_n, zoom = T, FIGUREPATH = NULL) +
    annotate('rect', xmin = boot_saddle_ci$percent[4], xmax = boot_saddle_ci$percent[5], ymin = -Inf, ymax = Inf, alpha = 0.2, fill = 'black') +
    labs(subtitle = paste0("Saddle point: ", round(boot_saddle_ci$t0, 2), " μg/L [", round(boot_saddle_ci$percent[4], 2), ", ", round(boot_saddle_ci$percent[5], 2), "]"))

ggsave(paste0(FIGUREPATH, response, "_rcs_final.pdf"), rcs_final_p, width = 8, height = 6, device = cairo_pdf)

rcs_final_p

Find estimates for saddle point via bootstrapping
rcs_final_saddle_density_p <-
    ggplot(data = data.frame(t = boot_saddle$t), aes(x = t)) +
    geom_density() +
    theme_minimal() +
    labs(x = "Saddle point",
         y = "Density")

ggsave(paste0(FIGUREPATH, response, "_rcs_final_saddle_density.pdf"), rcs_final_saddle_density_p, width = 8, height = 6, device = cairo_pdf)

rcs_final_saddle_density_p

Bayesian breakpoint analysis

We limit our data to here to Ferritin <= 100, otherwise the 2 breakpoint version will place the second breakpoint way beyond it. This decision is a source of bias.

1 breakpoint

Bayesian breakpoint analysis, 1 breakpoint
concat_data <- data %>% 
    filter(Ferritin <= 100)

if (!file.exists(paste0(MODELPATH, response, "_bayesian_1bp.rds"))) {
    # Specify model
    if (response == "Hemoglobin") {
        baye_1bp_spec <- list(Hemoglobin ~ Ferritin,
                              ~ 0 + Ferritin)
    }
    if (response == "sTfR") {
        baye_1bp_spec <- list(sTfR ~ Ferritin,
                              ~ 0 + Ferritin)
    }
    
    # Fit
    baye_1bp_fit <- mcp(baye_1bp_spec, 
                   data = concat_data,
                   adapt = 500,
                   iter = 500,
                   chains = 4,
                   cores = 4)
    
    saveRDS(baye_1bp_fit, paste0(MODELPATH, response, "_bayesian_1bp.rds"))
} else {
    baye_1bp_fit <- readRDS(paste0(MODELPATH, response, "_bayesian_1bp.rds"))
    }

summary(baye_1bp_fit)
Family: gaussian(link = 'identity')
Iterations: 2000 from 4 chains.
Segments:
  1: Hemoglobin ~ Ferritin
  2: Hemoglobin ~ 1 ~ 0 + Ferritin

Population-level parameters:
       name    mean   lower   upper Rhat n.eff
 Ferritin_1  0.2094  0.1908  0.2303  1.1    10
 Ferritin_2  0.0074  0.0062  0.0088  1.0   323
       cp_1 14.7588 14.0805 15.5867  1.1    39
      int_1 10.1398  9.9456 10.3195  1.1    19
    sigma_1  1.0115  0.9927  1.0304  1.0  1200
Bayesian breakpoint analysis, 1 breakpoint
# Plot
bay1_p <- plot(baye_1bp_fit) +
    theme_minimal() +
    coord_cartesian(xlim = c(0, 100))

posterior_check_p <- pp_check(baye_1bp_fit)

convergence_check_p <- plot_pars(baye_1bp_fit, regex_pars = "cp_")

ggsave(paste0(FIGUREPATH, response, "_bay1bp.pdf"), bay1_p, width = 8, height = 6, device = cairo_pdf)
ggsave(paste0(FIGUREPATH, response, "_bay1bp_ppcheck.pdf"), posterior_check_p, width = 8, height = 6, device = cairo_pdf)
ggsave(paste0(FIGUREPATH, response, "_bay1bp_convcheck.pdf"), convergence_check_p, width = 8, height = 6, device = cairo_pdf)

bay1_p

Bayesian breakpoint analysis, 1 breakpoint
posterior_check_p

Bayesian breakpoint analysis, 1 breakpoint
convergence_check_p

2 breakpoints

Bayesian breakpoint analysis, 2 breakpoints
if (!file.exists(paste0(MODELPATH, response, "_bayesian_2bp.rds"))) {
    # Specify model
    if (response == "Hemoglobin") {
        baye_2bp_spec <- list(Hemoglobin ~ Ferritin,
                              ~ 0 + Ferritin,
                              ~ 0 + Ferritin)
    }
    if (response == "sTfR") {
        baye_2bp_spec <- list(sTfR ~ Ferritin,
                              ~ 0 + Ferritin,
                              ~ 0 + Ferritin)
    }
    
    # Fit
    baye_2bp_fit <- mcp(baye_2bp_spec, 
                   data = concat_data,
                   adapt = 500,
                   iter = 500,
                   chains = 4,
                   cores = 4)
    
    saveRDS(baye_2bp_fit, paste0(MODELPATH, response, "_bayesian_2bp.rds"))
} else {
    baye_2bp_fit <- readRDS(paste0(MODELPATH, response, "_bayesian_2bp.rds"))
}

summary(baye_2bp_fit)
Family: gaussian(link = 'identity')
Iterations: 2000 from 4 chains.
Segments:
  1: Hemoglobin ~ Ferritin
  2: Hemoglobin ~ 1 ~ 0 + Ferritin
  3: Hemoglobin ~ 1 ~ 0 + Ferritin

Population-level parameters:
       name    mean   lower   upper Rhat n.eff
 Ferritin_1  1.0863  0.5739  1.5087 10.5     9
 Ferritin_2  0.1500  0.1193  0.1761  2.0    21
 Ferritin_3  0.0068  0.0051  0.0082  1.0   230
       cp_1  4.4437  3.4641  6.0420  6.1    15
       cp_2 16.9235 15.6629 18.8014  1.4    30
      int_1  6.8142  5.7630  8.3873  8.5    13
    sigma_1  1.0052  0.9867  1.0239  1.0  1642
Bayesian breakpoint analysis, 2 breakpoints
# Plot
bay2_p <- plot(baye_2bp_fit) +
    theme_minimal() +
    coord_cartesian(xlim = c(0, 100))

posterior_check_p <- pp_check(baye_2bp_fit)

convergence_check_p <- plot_pars(baye_2bp_fit, regex_pars = "cp_")

ggsave(paste0(FIGUREPATH, response, "_bay2bp.pdf"), bay2_p, width = 8, height = 6, device = cairo_pdf)
ggsave(paste0(FIGUREPATH, response, "_bay2bp.pdf"), posterior_check_p, width = 8, height = 6, device = cairo_pdf)
ggsave(paste0(FIGUREPATH, response, "_bay2bp.pdf"), convergence_check_p, width = 8, height = 6, device = cairo_pdf)

bay2_p

Bayesian breakpoint analysis, 2 breakpoints
posterior_check_p

Bayesian breakpoint analysis, 2 breakpoints
convergence_check_p

Play with scam

scamfit <- scam(Hemoglobin ~ s(Ferritin, k = 8, bs = "mpi"), data = concat_data)
plot(scamfit, residuals = T)

From the supplement of Galetti et al. (2021):

To identify thresholds, we then used the method of finite differences to estimate the first derivatives of the fitted trend spline from the generalized additive modelling smoother and we identified the periods in which the estimated rate of change between fitted y and x was statistically significant (i.e. statistically different from zero) by computing the uncertainty around the estimates from 10 000 posterior simulations. At the point on the x-axis where 95% of the posterior simulations are all different from zero for the first derivative, we could identify where the slope between fitted y and x began to be signiicantly positive (or negative).

The issue currently is that we are deliberately choosing to shape restrict the fit to a monotonically increasing curve \(\rightarrow\) the derivative can never cross zero, so we’d have to choose an arbitrary number “close to zero” instead, which will be difficult to justify.

d1 <- derivative.scam(scamfit)
xx <- sort(concat_data$Ferritin, index = TRUE)
plot(xx$x, d1$d[xx$ix], type = "l")